0
Install R
Install RStudio
Install Java
Download a Very Large dataset: https://www.dropbox.com/s/5a210w7zhm5a97p/alleen_personenauto.csv.zip?dl=0
Git repository git clone https://github.com/ArieTwigt/sparklyrintro.git
“Unified analytics engine for large-scale data processing”
Developed by Matei Zaharia at UC Berkeley’s AMPLab in 2009
Donated in 2012 to the Apache Software foundation –> Apache Spark
Founders of Apache Spark – > Databricks
Otherwise:
Python:
R:
Companies dealing with very large amounts of data…
Local mode or Cluster mode
Spark is written in Scala –> Lives in a JVM
Scala, Java, Python, R
*** API’s? ***
R: Statistical programming language: Introduction course/book: https://growyourdataskills.com/r_programmeren/
R and Spark
Introduction with examples:
https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.html
Spark + dplyr = sparklyR
Working with large datasets in the dplyr syntax.
** Spark has never been that easy**
Cheatsheets available:
Open R and RStudio –> Create a new project
Install the sparklyr package in R
install.packages("sparklyr")
Open the sparklyr package
library(sparklyr)
Check the available Spark versions
versions <- spark_available_versions()$spark
print(versions)
## [1] "1.6.3" "1.6.2" "1.6.1" "1.6.0" "2.0.0" "2.0.1" "2.0.2" "2.1.0"
## [9] "2.1.1" "2.2.0" "2.2.1" "2.3.0" "2.3.1" "2.3.2"
Install the latest spark version
latest_version <- tail(versions, 1)
print(paste0("Installing Spark version: ", latest_version))
spark_install(version="2.3.1")
Configurations
sc <- spark_connect(master = "local",
app_name = "ariespark")
If this works, you got Spark installed!
You also get this nice little Spark pane in RStudio
Other configurations:
library(parallel)
detectCores()
## [1] 4
Check configuration
config <- spark_config()
config
Change configuration
config$sparklyr.connect.cores.local <- 4
config$`sparklyr.shell.driver-memory` <- "2g"
config$`sparklyr.shell.executor-memory` <- "2g"
config$`spark.yarn.executor.memoryOverhead` <- "1g"
config
Resilient distributed dataset (RDD)
“A collection of elements partitioned across the nodes of the cluster that can be operated on in parallel.”
Immutable
Two types of operations on RDD’s:
You don’t care in sparklyr because you just use the dplyr syntax 😎
But you should monitor the performance.
Monitor what is sparklyr doing with Spark under the hood:
Read large csv-files in R faster with data.table:
import data into RStudio
library(readr)
alleen_personenauto_filtered <- read_csv("data/alleen_personenauto_filtered.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Bruto BPM` = col_integer(),
## `Aantal zitplaatsen` = col_integer(),
## `Aantal cilinders` = col_integer(),
## Cilinderinhoud = col_integer(),
## `Massa ledig voertuig` = col_integer(),
## `Toegestane maximum massa voertuig` = col_integer(),
## `Massa rijklaar` = col_integer(),
## `Maximum massa trekken ongeremd` = col_integer(),
## `Maximum trekken massa geremd` = col_integer(),
## Catalogusprijs = col_integer(),
## `Aantal deuren` = col_integer(),
## `Aantal wielen` = col_integer(),
## `Afstand hart koppeling tot achterzijde voertuig` = col_integer(),
## `Afstand voorzijde voertuig tot hart koppeling` = col_integer(),
## Lengte = col_integer(),
## Breedte = col_integer(),
## `Technische max. massa voertuig` = col_integer(),
## `Volgnummer wijziging EU typegoedkeuring` = col_integer(),
## `Vermogen massarijklaar` = col_double(),
## Wielbasis = col_integer()
## # ... with 1 more columns
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 2)
## Warning: 1 parsing failure.
## row # A tibble: 1 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 86755 Cataloguspr~ no trailing char~ e+05 'data/alleen_personenauto_f~ file # A tibble: 1 x 5
Check the dataset
dim(alleen_personenauto_filtered)
## [1] 115403 48
copy_to() from dplyr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
start <- Sys.time()
copy_to(sc, alleen_personenauto_filtered)
## # Source: spark<alleen_personenauto_filtered> [?? x 48]
## Kenteken Voertuigsoort Merk Handelsbenaming Vervaldatum_APK
## * <chr> <chr> <chr> <chr> <chr>
## 1 72ZDSP Personenauto VOLK~ GOLF 13/06/2019
## 2 43NGLJ Personenauto OPEL CORSA-C 02/01/2019
## 3 HVGD51 Personenauto TOYO~ STARLET 1.3 E2 26/02/2019
## 4 JT971V Personenauto AUDI A3 SPORTBACK 24/06/2020
## 5 51SGF9 Personenauto TOYO~ TOYOTA YARIS 12/09/2020
## 6 95LNXV Personenauto FIAT FIAT PUNTO 17/12/2018
## 7 JT411J Personenauto VOLVO V70 20/06/2019
## 8 TDFN97 Personenauto PEUG~ 306 11/07/2019
## 9 53PLTN Personenauto DAEW~ KALOS 06/03/2019
## 10 JT748X Personenauto NISS~ NISSAN MICRA 24/06/2020
## # ... with more rows, and 43 more variables: Datum_tenaamstelling <chr>,
## # Bruto_BPM <int>, Inrichting <chr>, Aantal_zitplaatsen <int>,
## # Eerste_kleur <chr>, Tweede_kleur <chr>, Aantal_cilinders <int>,
## # Cilinderinhoud <int>, Massa_ledig_voertuig <int>,
## # Toegestane_maximum_massa_voertuig <int>, Massa_rijklaar <int>,
## # Maximum_massa_trekken_ongeremd <int>,
## # Maximum_trekken_massa_geremd <int>, Retrofit_roetfilter <chr>,
## # Zuinigheidslabel <chr>, Datum_eerste_toelating <chr>,
## # Datum_eerste_afgifte_Nederland <chr>, Wacht_op_keuren <chr>,
## # Catalogusprijs <int>, WAM_verzekerd <chr>, Aantal_deuren <int>,
## # Aantal_wielen <int>,
## # Afstand_hart_koppeling_tot_achterzijde_voertuig <int>,
## # Afstand_voorzijde_voertuig_tot_hart_koppeling <int>, Lengte <int>,
## # Breedte <int>, Europese_voertuigcategorie <chr>,
## # Plaats_chassisnummer <chr>, Technische_max_massa_voertuig <int>,
## # Type <chr>, Type_gasinstallatie <chr>, Typegoedkeuringsnummer <chr>,
## # Variant <chr>, Uitvoering <chr>,
## # Volgnummer_wijziging_EU_typegoedkeuring <int>,
## # Vermogen_massarijklaar <dbl>, Wielbasis <int>, Export_indicator <chr>,
## # Openstaande_terugroepactie_indicator <chr>,
## # Vervaldatum_tachograaf <chr>, jaar <int>, maan <chr>, maand <chr>
end <- Sys.time()
end - start
## Time difference of 10.67502 secs
Data wrangling with dplyr
Notice the %>% piping
auto_merken <- alleen_personenauto_filtered %>%
group_by(Merk) %>%
summarise(aantal = n()) %>%
filter(aantal > 100) %>%
arrange(desc(aantal))
library(ggplot2)
autos_merk_plot <- qplot(Merk, data=head(auto_merken, 25), geom="bar", weight=aantal, ylab="aantal") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
autos_merk_plot
auto_carrosserie <- alleen_personenauto_filtered %>%
group_by(Inrichting) %>%
summarise(aantal = n()) %>%
filter(aantal > 100) %>%
arrange(aantal, desc(-aantal))
autos_carrosserie_plot <- qplot(Inrichting, data=tail(auto_carrosserie, 50), geom="bar", weight=aantal, ylab="aantal") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
autos_carrosserie_plot
End of Part 1
Go Play :) with the dplyr syntax. In the next part we are going to play with some larger dataset and some prediction models.
Clear your environment and restart R
Session –> Terminate R
If there are still elements in the Environment tab, execute:
remove(list = ls())
library(dplyr)
library(sparklyr)
This time, we want some more power. We are going to overrule the default settings of the Spark Config
config = spark_config()
config$`sparklyr.shell.driver-memory` <- "6G"
config$`sparklyr.shell.executor-memory` <- "6G"
config$`spark.yarn.executor.memoryOverhead` <- "2G"
We set up a Spark connection
sc <- spark_connect("local",
# version = "2.1.0", # Uncomment if you like to choose a specific version
config = config)
Import the real dataset, 2.96 GB in-memory😳😳😳. This time, we use the spark_read_csv() function. This will import the dataset as an RDD in Spark and not in R. It will take a few minutes and you will hear your machine putting in some serious work 💻
start <- Sys.time()
spark_read_csv(sc,
name = "alleen_personenauto",
path = "data/alleen_personenauto.csv")
## # Source: spark<alleen_personenauto> [?? x 45]
## Kenteken Voertuigsoort Merk Handelsbenaming Vervaldatum_APK
## * <chr> <chr> <chr> <chr> <chr>
## 1 68JTJN Personenauto RENA~ MEGANE CLASSIC 14/07/2019
## 2 66JGSB Personenauto FORD MONDEO 15/03/2019
## 3 42LGJG Personenauto FORD FOCUS 03/02/2019
## 4 HD995K Personenauto FORD FIESTA 16/10/2019
## 5 TVRZ97 Personenauto VOLVO V70 11/05/2019
## 6 9TDX21 Personenauto VOLK~ GOLF 27/02/2020
## 7 62RKFH Personenauto AUDI AUDI A6 19/01/2019
## 8 98XNXH Personenauto FORD KA 31/07/2019
## 9 82LDXZ Personenauto FORD FIESTA 09/03/2019
## 10 59PPP4 Personenauto KIA PICANTO 01/09/2019
## # ... with more rows, and 40 more variables: Datum_tenaamstelling <chr>,
## # Bruto_BPM <chr>, Inrichting <chr>, Aantal_zitplaatsen <int>,
## # Eerste_kleur <chr>, Tweede_kleur <chr>, Aantal_cilinders <int>,
## # Cilinderinhoud <chr>, Massa_ledig_voertuig <chr>,
## # Toegestane_maximum_massa_voertuig <chr>, Massa_rijklaar <chr>,
## # Maximum_massa_trekken_ongeremd <chr>,
## # Maximum_trekken_massa_geremd <chr>, Retrofit_roetfilter <chr>,
## # Zuinigheidslabel <chr>, Datum_eerste_toelating <chr>,
## # Datum_eerste_afgifte_Nederland <chr>, Wacht_op_keuren <chr>,
## # Catalogusprijs <chr>, WAM_verzekerd <chr>, Aantal_deuren <int>,
## # Aantal_wielen <int>,
## # Afstand_hart_koppeling_tot_achterzijde_voertuig <int>,
## # Afstand_voorzijde_voertuig_tot_hart_koppeling <int>, Lengte <chr>,
## # Breedte <int>, Europese_voertuigcategorie <chr>,
## # Plaats_chassisnummer <chr>, Technische_max_massa_voertuig <chr>,
## # Type <chr>, Type_gasinstallatie <chr>, Typegoedkeuringsnummer <chr>,
## # Variant <chr>, Uitvoering <chr>,
## # Volgnummer_wijziging_EU_typegoedkeuring <int>,
## # Vermogen_massarijklaar <dbl>, Wielbasis <chr>, Export_indicator <chr>,
## # Openstaande_terugroepactie_indicator <chr>,
## # Vervaldatum_tachograaf <chr>
end <- Sys.time()
Recall that the object alleen_personenauto is not present in the Environment tab in RStudio, but only in the Connections tab. We loaded the data directly from the csv-file into Spark.
Question: what is the advantage of this approach?
Is it really not available in R??
dim(alleen_personenauto)
Yes, it is really not available.
It is in Spark:
src_tbls(sc)
## [1] "alleen_personenauto"
Check the Spark UI: http://localhost:4040
There is our dataset. Check the Cached Partitions value.
Or data is divided into 23 smaller sets. Hence the name Resilient Distributed Dataset –> Distributed.
How did Spark do this? Check out the Jobs tab in the Spark UI.
Time to work with the data. Use our R skills to work with the large dataset that currently lives in Spark.
Read into R with the tbl() function.
tbl_autos <- tbl(sc, "alleen_personenauto")
Wow that was fast!!!! Almost 3 GB loaded from Spark into the RSession. … … … Wait ……. did we really load the data into R? 🤔
tbl_autos
## # Source: spark<alleen_personenauto> [?? x 45]
## Kenteken Voertuigsoort Merk Handelsbenaming Vervaldatum_APK
## * <chr> <chr> <chr> <chr> <chr>
## 1 68JTJN Personenauto RENA~ MEGANE CLASSIC 14/07/2019
## 2 66JGSB Personenauto FORD MONDEO 15/03/2019
## 3 42LGJG Personenauto FORD FOCUS 03/02/2019
## 4 HD995K Personenauto FORD FIESTA 16/10/2019
## 5 TVRZ97 Personenauto VOLVO V70 11/05/2019
## 6 9TDX21 Personenauto VOLK~ GOLF 27/02/2020
## 7 62RKFH Personenauto AUDI AUDI A6 19/01/2019
## 8 98XNXH Personenauto FORD KA 31/07/2019
## 9 82LDXZ Personenauto FORD FIESTA 09/03/2019
## 10 59PPP4 Personenauto KIA PICANTO 01/09/2019
## # ... with more rows, and 40 more variables: Datum_tenaamstelling <chr>,
## # Bruto_BPM <chr>, Inrichting <chr>, Aantal_zitplaatsen <int>,
## # Eerste_kleur <chr>, Tweede_kleur <chr>, Aantal_cilinders <int>,
## # Cilinderinhoud <chr>, Massa_ledig_voertuig <chr>,
## # Toegestane_maximum_massa_voertuig <chr>, Massa_rijklaar <chr>,
## # Maximum_massa_trekken_ongeremd <chr>,
## # Maximum_trekken_massa_geremd <chr>, Retrofit_roetfilter <chr>,
## # Zuinigheidslabel <chr>, Datum_eerste_toelating <chr>,
## # Datum_eerste_afgifte_Nederland <chr>, Wacht_op_keuren <chr>,
## # Catalogusprijs <chr>, WAM_verzekerd <chr>, Aantal_deuren <int>,
## # Aantal_wielen <int>,
## # Afstand_hart_koppeling_tot_achterzijde_voertuig <int>,
## # Afstand_voorzijde_voertuig_tot_hart_koppeling <int>, Lengte <chr>,
## # Breedte <int>, Europese_voertuigcategorie <chr>,
## # Plaats_chassisnummer <chr>, Technische_max_massa_voertuig <chr>,
## # Type <chr>, Type_gasinstallatie <chr>, Typegoedkeuringsnummer <chr>,
## # Variant <chr>, Uitvoering <chr>,
## # Volgnummer_wijziging_EU_typegoedkeuring <int>,
## # Vermogen_massarijklaar <dbl>, Wielbasis <chr>, Export_indicator <chr>,
## # Openstaande_terugroepactie_indicator <chr>,
## # Vervaldatum_tachograaf <chr>
Yes, we got the data right?
Let’s filter the data like we did with the smaller dataset.
audis <- tbl_autos %>%
filter(Merk == "AUDI")
No problem, we got all the Audi’s now.
head(audis)
## # Source: spark<?> [?? x 45]
## Kenteken Voertuigsoort Merk Handelsbenaming Vervaldatum_APK
## * <chr> <chr> <chr> <chr> <chr>
## 1 62RKFH Personenauto AUDI AUDI A6 19/01/2019
## 2 HV186F Personenauto AUDI A4 LIMOUSINE 19/02/2020
## 3 13GVGL Personenauto AUDI AUDI A4 <NA>
## 4 RK608N Personenauto AUDI Q3 19/12/2021
## 5 30THL5 Personenauto AUDI A5 CABRIOLET 01/02/2020
## 6 GX300G Personenauto AUDI AUDI A4 <NA>
## # ... with 40 more variables: Datum_tenaamstelling <chr>, Bruto_BPM <chr>,
## # Inrichting <chr>, Aantal_zitplaatsen <int>, Eerste_kleur <chr>,
## # Tweede_kleur <chr>, Aantal_cilinders <int>, Cilinderinhoud <chr>,
## # Massa_ledig_voertuig <chr>, Toegestane_maximum_massa_voertuig <chr>,
## # Massa_rijklaar <chr>, Maximum_massa_trekken_ongeremd <chr>,
## # Maximum_trekken_massa_geremd <chr>, Retrofit_roetfilter <chr>,
## # Zuinigheidslabel <chr>, Datum_eerste_toelating <chr>,
## # Datum_eerste_afgifte_Nederland <chr>, Wacht_op_keuren <chr>,
## # Catalogusprijs <chr>, WAM_verzekerd <chr>, Aantal_deuren <int>,
## # Aantal_wielen <int>,
## # Afstand_hart_koppeling_tot_achterzijde_voertuig <int>,
## # Afstand_voorzijde_voertuig_tot_hart_koppeling <int>, Lengte <chr>,
## # Breedte <int>, Europese_voertuigcategorie <chr>,
## # Plaats_chassisnummer <chr>, Technische_max_massa_voertuig <chr>,
## # Type <chr>, Type_gasinstallatie <chr>, Typegoedkeuringsnummer <chr>,
## # Variant <chr>, Uitvoering <chr>,
## # Volgnummer_wijziging_EU_typegoedkeuring <int>,
## # Vermogen_massarijklaar <dbl>, Wielbasis <chr>, Export_indicator <chr>,
## # Openstaande_terugroepactie_indicator <chr>,
## # Vervaldatum_tachograaf <chr>
So, show me the average price
mean(as.numeric(audis$Catalogusprijs), na.rm = TRUE)
## [1] NaN
“Not a Number”??
Just show me a column than?
audis$Kenteken
## NULL
NULL? 😨 What is this kind of crappy data frame?
class(audis)
## [1] "tbl_spark" "tbl_sql" "tbl_lazy" "tbl"
–> tbl_lazy. —> Resilient. It will execute only if we tell him to execute something.
Execute Spark –> Now spark will work for you. You will hear your machine work again. We collect the audis set and saved is as an R data frame object.
audis_df <- collect(audis)
mean(as.numeric(audis_df$Catalogusprijs), na.rm = TRUE)
## Warning in mean(as.numeric(audis_df$Catalogusprijs), na.rm = TRUE): NAs
## introduced by coercion
## [1] 481.5
Now we have our price. Wait… that is some crappy format, let’s work on that later.
Let’s get all the station cars:
audis_stations_df <- audis_df %>%
filter(Inrichting == "stationwagen")
Question, are we using Spark or regular R here?
mycar <- tbl_autos %>%
filter(Kenteken == "TB725F")
mycar_df <- collect(mycar)
Check available features
str(audis_stations_df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 125811 obs. of 45 variables:
## $ Kenteken : chr "62RKFH" "RK608N" "GX300G" "76FGRL" ...
## $ Voertuigsoort : chr "Personenauto" "Personenauto" "Personenauto" "Personenauto" ...
## $ Merk : chr "AUDI" "AUDI" "AUDI" "AUDI" ...
## $ Handelsbenaming : chr "AUDI A6" "Q3" "AUDI A4" "8D AUDI A4" ...
## $ Vervaldatum_APK : chr "19/01/2019" "19/12/2021" NA "22/12/2018" ...
## $ Datum_tenaamstelling : chr "08/07/2017" "19/12/2017" "11/04/2017" "08/06/2013" ...
## $ Bruto_BPM : chr "20,771" "6,861" "2,836" "7,750" ...
## $ Inrichting : chr "stationwagen" "stationwagen" "stationwagen" "stationwagen" ...
## $ Aantal_zitplaatsen : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Eerste_kleur : chr "ZWART" "BLAUW" "ZWART" "BLAUW" ...
## $ Tweede_kleur : chr "Niet geregistreerd" "Niet geregistreerd" "Niet geregistreerd" "Niet geregistreerd" ...
## $ Aantal_cilinders : int 6 4 4 4 4 6 4 4 4 6 ...
## $ Cilinderinhoud : chr "3,123" "1,395" "1,968" "1,781" ...
## $ Massa_ledig_voertuig : chr "1,715" "1,380" "1,495" "1,260" ...
## $ Toegestane_maximum_massa_voertuig : chr "2,370" "2,005" "2,090" "1,835" ...
## $ Massa_rijklaar : chr "1,815" "1,480" "1,595" "1,360" ...
## $ Maximum_massa_trekken_ongeremd : chr "750" "740" "750" "680" ...
## $ Maximum_trekken_massa_geremd : chr "1,900" "1,800" "1,600" "1,300" ...
## $ Retrofit_roetfilter : chr NA NA "Nee" NA ...
## $ Zuinigheidslabel : chr "E" "C" "B" NA ...
## $ Datum_eerste_toelating : chr "28/04/2005" "19/12/2017" "23/01/2009" "10/03/2000" ...
## $ Datum_eerste_afgifte_Nederland : chr "28/04/2005" "19/12/2017" "01/09/2015" "10/03/2000" ...
## $ Wacht_op_keuren : chr "Geen verstrekking in Open Data" "Geen verstrekking in Open Data" "Geen verstrekking in Open Data" "Geen verstrekking in Open Data" ...
## $ Catalogusprijs : chr NA "44,683" "51,376" NA ...
## $ WAM_verzekerd : chr "Ja" "Ja" "Ja" "Ja" ...
## $ Aantal_deuren : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Aantal_wielen : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Afstand_hart_koppeling_tot_achterzijde_voertuig: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Afstand_voorzijde_voertuig_tot_hart_koppeling : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Lengte : chr "0" "0" "0" "0" ...
## $ Breedte : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Europese_voertuigcategorie : chr "M1" "M1" "M1" "M1" ...
## $ Plaats_chassisnummer : chr "midden tegen schutbord onder motorkap" "in motorruimte r." "op r. schroefveerkoker onder motorkap" "midden tegen schutbord onder motorkap" ...
## $ Technische_max_massa_voertuig : chr "2,370" "2,005" "2,090" "1,835" ...
## $ Type : chr NA "8U" "B8" NA ...
## $ Type_gasinstallatie : chr NA NA NA NA ...
## $ Typegoedkeuringsnummer : chr "e1*2001/116*0254*05" "e1*2007/46*0591*19" "e1*2001/116*0430*10" "e1*98/14*0013*18" ...
## $ Variant : chr "AAUKQ1" "XCZEAF1" "ACAHAF1" "AAPTF1" ...
## $ Uitvoering : chr "QA6LC0R4F600GGR" "FD6FD62E027P8U027MMEM1" "FM6B1002R8K617MG" "FM52DMR92M" ...
## $ Volgnummer_wijziging_EU_typegoedkeuring : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Vermogen_massarijklaar : num 0.1 0.07 0.08 0.07 0.08 0.08 0.08 0.09 0.06 0.14 ...
## $ Wielbasis : chr "286" "260" "282" "260" ...
## $ Export_indicator : chr "Nee" "Nee" "Ja" "Nee" ...
## $ Openstaande_terugroepactie_indicator : chr "Nee" "Nee" "Nee" "Nee" ...
## $ Vervaldatum_tachograaf : chr NA NA NA NA ...
audis_clean <- audis %>%
mutate(Catalogusprijs = regexp_replace(Catalogusprijs, ",", ".")) %>% # regex
mutate(Catalogusprijs = as.numeric(Catalogusprijs)) %>% # convert columns
select(Catalogusprijs, Aantal_cilinders) %>% # select columns
na.omit() # remove rows with empty values ('NA', 'NaN' etc.)
## * Dropped 108201 rows with 'na.omit' (297347 => 189146)
Question, are we using Spark or regular R here?
lm_model_1 <- audis_clean %>%
ml_linear_regression(Catalogusprijs ~ Aantal_cilinders)
Show model performance
summary(lm_model_1)
## Deviance Residuals (approximate):
## Min 1Q Median 3Q Max
## -127.0589 -9.7894 -0.7141 8.1839 559.5385
##
## Coefficients:
## (Intercept) Aantal_cilinders
## -52.18171 24.39120
##
## R-Squared: 0.6417
## Root Mean Squared Error: 15.91
New model, importing cars cheaper?
audis_clean <- audis %>%
mutate(Catalogusprijs = regexp_replace(Catalogusprijs, ",", ".")) %>% # regex
mutate(Catalogusprijs = as.numeric(Catalogusprijs)) %>% # convert columns
select(Catalogusprijs, Aantal_cilinders, Export_indicator) %>% # select columns
na.omit() # remove rows with empty values ('NA', 'NaN' etc.)
## * Dropped 108201 rows with 'na.omit' (297347 => 189146)
lm_model_2 <- audis_clean %>%
ml_linear_regression(Catalogusprijs ~ Aantal_cilinders + Export_indicator)
Show model performance
summary(lm_model_2)
## Deviance Residuals (approximate):
## Min 1Q Median 3Q Max
## -104.9641 -9.8738 -0.7618 8.2102 809.5642
##
## Coefficients:
## (Intercept) Aantal_cilinders Export_indicator_Nee
## -53.1181662 24.4010402 0.9497815
##
## R-Squared: 0.6418
## Root Mean Squared Error: 15.91
ferraris_maseratis <- tbl_autos %>%
filter(Merk %in% c("FERRARI", "MASERATI"))